Rmarkdown to pre-process scRNA seq from monomer stimulated PBMCs.

Create the Seurat Object.

.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))

library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.2.0 but the current version is
## 4.3.0; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
## 
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
## 
##     intersect
library(patchwork)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
pbmc.data <- Read10X(data.dir = "/sc/arion/projects/ad-omics/ashley/data/aM/outs/per_sample_outs/aM/count/sample_filtered_feature_bc_matrix")

pbmc.m <- CreateSeuratObject(counts = pbmc.data, project = "pbmc_m", min.cells = 3, min.features = 200)
pbmc.m
## An object of class Seurat 
## 24658 features across 18951 samples within 1 assay 
## Active assay: RNA (24658 features, 0 variable features)
##  1 layer present: counts

Lets check how many cells and features we are starting with.

length(colnames(pbmc.m)) #number of cells
## [1] 18951
length(rownames(pbmc.m)) #number of features 
## [1] 24658

Quality Control (QC)

a. QC mitochondiral genes. The PercentFeatureSet() calculates the % of counts originating from a set of features - here we are first looking at mitochondiral features, which are genes starting with MT. 
b. By using [[ ]], I am adding columns to the pbmc matrix to store this QC data. 
pbmc.m[["percent.mt"]] <- PercentageFeatureSet(pbmc.m, pattern = "^MT-")

# Show QC metrics for the first 5 cells
head(pbmc.m@meta.data, 5)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCGATATA-1     pbmc_m      36569         7002   3.040827
## AAACCTGAGCGTAGTG-1     pbmc_m      13800         3631   2.471014
## AAACCTGAGGCAGGTT-1     pbmc_m       6063         2532   3.842982
## AAACCTGAGTCTTGCA-1     pbmc_m      16873         4662   4.065667
## AAACCTGAGTGCCATT-1     pbmc_m      15430         4615   3.000648
library(farver, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library")
VlnPlot(pbmc.m, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(pbmc.m, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc.m, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 

plot2

# I will select for %mt under 10% and features greater than 200 to capture alive healthy cells. 
pbmc.m <- subset(pbmc.m, subset = nFeature_RNA > 200 & percent.mt < 10)
head(pbmc.m@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCGATATA-1     pbmc_m      36569         7002   3.040827
## AAACCTGAGCGTAGTG-1     pbmc_m      13800         3631   2.471014
## AAACCTGAGGCAGGTT-1     pbmc_m       6063         2532   3.842982
## AAACCTGAGTCTTGCA-1     pbmc_m      16873         4662   4.065667
## AAACCTGAGTGCCATT-1     pbmc_m      15430         4615   3.000648
## AAACCTGCAACACCTA-1     pbmc_m        670          460   3.731343

Normalize the data.

The data is normalized based on the feature (number of genes in a cell) by the total expression. This number is multiplied by 10,000 and then log transformed. The function to do this is “NormalizeData.” The values specied below are the default values of this function.

pbmc.m <- NormalizeData(pbmc.m, normalization.method = "LogNormalize", scale.factor = 10000)
## Normalizing layer: counts

Identification of highly variable features

pbmc.m <- FindVariableFeatures(pbmc.m, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc.m), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc.m)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
plot1 

plot2

## Scaling the Data The data is scaled by doing a linear transformation. The ScaleData function does this by shifting the expression of genes so that the mean expression becomes 0 and the variance is 1. By default, only variable genes are scaled. This is changed by features = all.genes

##Scaling RNA data, we only scale the variable features here for efficiency
all.genes <- rownames(pbmc.m)
pbmc.m <- ScaleData(pbmc.m, vars.to.regress = c("percent.mt"))
## Regressing out percent.mt
## Centering and scaling data matrix

Perform linear dimensional reduction

For the first principal components, RunPCA shows the most positive (correlated) and most negative (anticorrelated) genes

pbmc.m <- RunPCA(pbmc.m, features = VariableFeatures(object = pbmc.m))
## PC_ 1 
## Positive:  IL7R, FP236383.3, TCF7, LTB, AQP3, CD27, PIM2, JAML, CTSS, CCR7 
##     C1orf162, NCDN, FP671120.4, SAT1, IGFLR1, TIMP1, SNED1, FTL, SESN3, MYC 
##     CCR4, PASK, LINC00861, AC139720.1, CYSLTR1, BEX3, MAL, FAAH2, ACP5, HERPUD1 
## Negative:  RRM2, TYMS, MKI67, TUBA1B, STMN1, CDK1, TOP2A, NUSAP1, UBE2C, KIFC1 
##     PCLAF, ZWINT, ASF1B, TUBB, KNL1, CENPF, ASPM, CCNA2, TPX2, HIST1H1B 
##     PKMYT1, GTSE1, H2AFX, GGH, SPC25, CDCA2, CDCA8, CKAP2L, HJURP, AURKB 
## PC_ 2 
## Positive:  CTSW, PRF1, KLRD1, NKG7, GZMB, TYROBP, CCL4, CCL3, KLRC1, TRDC 
##     FCER1G, PLEK, CST7, IL2RB, AOAH, CCL5, FCGR3A, NCAM1, GNLY, KIR2DL4 
##     GZMA, SRGN, EOMES, TOX, KLRK1, MATK, CD300A, FGR, GSTP1, GZMK 
## Negative:  IL7R, TIMP1, AQP3, S100A6, ANP32B, SNED1, COTL1, RGCC, MYC, IL9R 
##     STAT1, TCF7, PDE3B, SELL, MAL, WDR86-AS1, SOS1, HSF4, INPP4B, LMNA 
##     NME4, FHIT, BEX3, CCR4, CRIP1, KLF3, PRDX1, IMPDH2, LTB, HMGA1 
## PC_ 3 
## Positive:  PLK1, CCNB1, KIF20A, DLGAP5, CENPA, PSRC1, ASPM, CDC20, CCNB2, HMMR 
##     KIF14, NEK2, CENPF, TROAP, ARL6IP1, CENPE, PIMREG, UBE2C, PIF1, CDCA8 
##     TOP2A, AURKA, CDCA3, KIF2C, KNSTRN, TPX2, KPNA2, BIRC5, KIF23, GTSE1 
## Negative:  GINS2, MCM2, MCM6, MCM5, CDC6, MCM7, MCM3, MCM4, PAICS, UNG 
##     CDC45, HSP90AB1, E2F1, DCTPP1, SLBP, CDCA7, CHAF1B, NME1, DTL, PPP1R14B 
##     FEN1, HELLS, CCNE2, CDK4, MIF, MSH6, SRM, DUT, C1QBP, POLD2 
## PC_ 4 
## Positive:  FCER1G, CCR7, TCF7, TXK, DHRS3, NCAM1, PTGDR, BACH2, SH2D1B, GSTM2 
##     SYK, KIR2DL4, CXXC5, KLRF1, ID3, RNF130, XCL2, NDFIP2, CYSLTR1, CTBP2 
##     LDB2, RAMP1, XCL1, TOX2, TYROBP, AFAP1L2, NCR1, ACTN1, ADGRG3, PTMA 
## Negative:  S100A6, HLA-DPA1, HLA-DRB1, LAG3, CD74, LYAR, LGALS1, HLA-DQA1, HLA-DRA, GZMH 
##     HLA-DPB1, FGFBP2, TRDV2, LINC02694, HPGD, THEMIS, FLNA, HOPX, AHNAK, MSC 
##     TRGV9, ANXA2, CCL5, NKG7, CST7, ALOX5AP, HLA-DQB1, CXCR3, LGALS3, LTB 
## PC_ 5 
## Positive:  IL9R, LGALS1, TNFRSF18, IL2RA, TNFRSF4, TIMP1, CAPG, SOS1, TNFSF10, ANXA2 
##     LMNA, IL17RB, GATA3, TXN, GAB2, TPM4, NDFIP2, LAT2, CSF2, ACTG1 
##     CD82, PRDX1, HSPA5, AHI1, IL3RA, SNED1, RBPJ, CCNB1, NQO1, TBXAS1 
## Negative:  GZMH, LINC00861, FGFBP2, GZMK, TRDV2, CCR7, C1orf21, TRGV9, KLRC4, TCF7 
##     PLEK, FAM111B, LYAR, CX3CR1, LINC02446, PCNA, LAG3, CXCR4, PRSS23, MYBL2 
##     CCL5, MCM7, CTNNAL1, PKMYT1, CLSPN, LIG1, TK1, CDT1, DUT, DTL
print(pbmc.m[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  IL7R, FP236383.3, TCF7, LTB, AQP3 
## Negative:  RRM2, TYMS, MKI67, TUBA1B, STMN1 
## PC_ 2 
## Positive:  CTSW, PRF1, KLRD1, NKG7, GZMB 
## Negative:  IL7R, TIMP1, AQP3, S100A6, ANP32B 
## PC_ 3 
## Positive:  PLK1, CCNB1, KIF20A, DLGAP5, CENPA 
## Negative:  GINS2, MCM2, MCM6, MCM5, CDC6 
## PC_ 4 
## Positive:  FCER1G, CCR7, TCF7, TXK, DHRS3 
## Negative:  S100A6, HLA-DPA1, HLA-DRB1, LAG3, CD74 
## PC_ 5 
## Positive:  IL9R, LGALS1, TNFRSF18, IL2RA, TNFRSF4 
## Negative:  GZMH, LINC00861, FGFBP2, GZMK, TRDV2

The PCA results can be visualized in different ways.

VizDimLoadings(pbmc.m, dims = 1:2, reduction = "pca")

DimPlot(pbmc.m, reduction = "pca") + NoLegend()

DimHeatmap(pbmc.m, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc.m, dims = 1:15, cells = 500, balanced = TRUE)

Determine the ‘dimensionality’ of the dataset

Cells will be clustered based on PCA. How many PC to use is dependent on many factors. For example, if trying to analyze a rare cell subset, you might want to add more PCs. Usually, the first 10 is good to see dimensionality of the data.

ElbowPlot(pbmc.m)

Cluster the cells.

##We select the top 15 PCs for clustering and tSNE based on PCElbowPlot
pbmc.m <- FindNeighbors(pbmc.m, reduction = "pca", dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
pbmc.m <- FindClusters(pbmc.m, resolution = 0.5, verbose = FALSE)
head(Idents(pbmc.m), 5)
## AAACCTGAGCGATATA-1 AAACCTGAGCGTAGTG-1 AAACCTGAGGCAGGTT-1 AAACCTGAGTCTTGCA-1 
##                  2                  2                  1                  2 
## AAACCTGAGTGCCATT-1 
##                  9 
## Levels: 0 1 2 3 4 5 6 7 8 9 10

Run non-linear dimensional reduction (UMAP/tSNE)

pbmc.m <- RunUMAP(pbmc.m, dims = 1:15)
## 10:56:03 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:56:03 Read 18706 rows and found 15 numeric columns
## 10:56:03 Using Annoy for neighbor search, n_neighbors = 30
## 10:56:03 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:56:05 Writing NN index file to temp file /tmp/RtmpLV7FNl/filedf1f675edde4
## 10:56:05 Searching Annoy index using 1 thread, search_k = 3000
## 10:56:11 Annoy recall = 100%
## 10:56:12 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:56:13 Initializing from normalized Laplacian + noise (using RSpectra)
## 10:56:13 Commencing optimization for 200 epochs, with 794186 positive edges
## 10:56:36 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc.m, reduction = "umap", label = TRUE)

pbmc.m <- RunTSNE(pbmc.m, reduction = "pca", dims = 1:15)
DimPlot(pbmc.m, reduction = "tsne", label = TRUE)

Lets check our metadata now of the seurat object to see what has been added.

head(pbmc.m@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCGATATA-1     pbmc_m      36569         7002   3.040827
## AAACCTGAGCGTAGTG-1     pbmc_m      13800         3631   2.471014
## AAACCTGAGGCAGGTT-1     pbmc_m       6063         2532   3.842982
## AAACCTGAGTCTTGCA-1     pbmc_m      16873         4662   4.065667
## AAACCTGAGTGCCATT-1     pbmc_m      15430         4615   3.000648
## AAACCTGCAACACCTA-1     pbmc_m        670          460   3.731343
##                    RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGCGATATA-1               2               2
## AAACCTGAGCGTAGTG-1               2               2
## AAACCTGAGGCAGGTT-1               1               1
## AAACCTGAGTCTTGCA-1               2               2
## AAACCTGAGTGCCATT-1               9               9
## AAACCTGCAACACCTA-1               0               0
#We now have seurat clusters and RNA_snn_res.0.5 columns added. 

Finding differentially expressed features (cluster biomarkers)

# find markers for every cluster compared to all remaining cells, report only the positive
# ones
markers_m <- FindAllMarkers(pbmc.m, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10

VlnPlot will display the differential expression across the clusters. For example, I am looking here at CD8A and CD4 expression in the clusters.

VlnPlot(pbmc.m, features = c("CD8A", "CD4"))

The raw counts can also be shown instead by adding some parameters.

VlnPlot(pbmc.m, features = c("CD8A", "CD4"), slot = "counts", log = TRUE)

FeaturePlot(pbmc.m, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
    "CD8A"))

Combine Seurat object with demuxlet output.

First, load and edit the .best demuxlet output file to make it more compatible.

.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))

library(patchwork)
library(tidyr)
m_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/aM.best", header = T, stringsAsFactors = F, check.names = F)
head(m_demuxlet)
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP
## 1 AAACCTGAGCGATATA-1  108714   15696   12405  5336
## 2 AAACCTGAGCGTAGTG-1   32770    4639    4030  1659
## 3 AAACCTGAGGCAGGTT-1   17692    2237    1768  1172
## 4 AAACCTGAGTCTTGCA-1   50476    6748    5402  2761
## 5 AAACCTGAGTGCCATT-1   44258    6139    4879  2498
## 6 AAACCTGCAACACCTA-1    2652     251     207   163
##                                                BEST             SNG.1ST
## 1 DBL-GSA8_0_NYUMD0327-01-GSA8_0_NYUMD0334-01-0.500 GSA8_0_NYUMD0334-01
## 2                           SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
## 3                           SNG-GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0327-01
## 4                           SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
## 5                           SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
## 6                           SNG-GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0327-01
##     SNG.LLK1             SNG.2ND   SNG.LLK2   SNG.LLK0             DBL.1ST
## 1 -3624.4315 GSA8_0_NYUMD0327-01 -4035.0095 -3409.1657 GSA8_0_NYUMD0327-01
## 2  -592.3739 GSA8_0_NYUMD0289-01 -1650.8367  -986.3261 GSA8_0_NYUMD0289-01
## 3  -419.2268 GSA8_0_NYUMD0289-01  -965.4518  -628.2189 GSA8_0_NYUMD0315-01
## 4  -925.1157 GSA8_0_NYUMD0289-01 -2551.6585 -1551.8233 GSA8_0_NYUMD0315-01
## 5  -860.0781 GSA8_0_NYUMD0334-01 -2364.5970 -1443.7875 GSA8_0_NYUMD0315-01
## 6   -55.5857 GSA8_0_NYUMD0315-01  -164.0652  -104.5352 GSA8_0_NYUMD0315-01
##               DBL.2ND ALPHA      LLK12       LLK1       LLK2      LLK10
## 1 GSA8_0_NYUMD0334-01   0.5 -2581.7203 -4035.0095 -3624.4315 -3643.5020
## 2 GSA8_0_NYUMD0315-01   0.5  -933.2834 -1650.8367  -592.3739 -1677.4911
## 3 GSA8_0_NYUMD0327-01   0.5  -564.1035  -994.3223  -419.2268  -840.8888
## 4 GSA8_0_NYUMD0327-01   0.5 -1447.4708  -925.1157 -2570.0140 -1455.0858
## 5 GSA8_0_NYUMD0334-01   0.5 -1359.2728  -860.0781 -2364.5970 -1390.1505
## 6 GSA8_0_NYUMD0327-01   0.5   -82.1285  -164.0652   -55.5857  -141.7246
##        LLK20      LLK00   PRB.DBL PRB.SNG1
## 1 -3476.9482 -3089.7874  1.00e+00      NaN
## 2  -933.2834 -1028.3060 2.94e-149        1
## 3  -565.5310  -627.3371  4.98e-64        1
## 4 -2278.0753 -1610.0639 4.65e-228        1
## 5 -2162.2396 -1504.2012 5.32e-218        1
## 6   -86.4408  -103.7608  1.00e-12        1

To edit: I will split the Best column into multiple columns.

m_demuxlet_edit = m_demuxlet %>% 
  mutate(BEST = gsub("-01","", BEST)) %>%
  separate(BEST, into=c("DMX_classification.global","DMX_maxID","DMX_secondID"), sep="-") %>%
  separate(DMX_maxID, into=c("DMX_garbage1","DMX_garbage2","DMX_maxID"), sep ="_") %>%
  separate(DMX_secondID, into=c("DMX_garbage3","DMX_garbage4","DMX_secondID"), sep ="_") %>%
  select(-contains("garbage"))
## Warning: Expected 3 pieces. Additional pieces discarded in 2409 rows [1, 8, 13,
## 16, 17, 20, 43, 48, 63, 69, 73, 81, 95, 112, 114, 129, 136, 141, 172, 202,
## ...].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 16725 rows [2,
## 3, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 18, 19, 21, 22, 23, 24, 25, 26, ...].
head(m_demuxlet_edit)
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP DMX_classification.global
## 1 AAACCTGAGCGATATA-1  108714   15696   12405  5336                       DBL
## 2 AAACCTGAGCGTAGTG-1   32770    4639    4030  1659                       SNG
## 3 AAACCTGAGGCAGGTT-1   17692    2237    1768  1172                       SNG
## 4 AAACCTGAGTCTTGCA-1   50476    6748    5402  2761                       SNG
## 5 AAACCTGAGTGCCATT-1   44258    6139    4879  2498                       SNG
## 6 AAACCTGCAACACCTA-1    2652     251     207   163                       SNG
##   DMX_maxID DMX_secondID             SNG.1ST   SNG.LLK1             SNG.2ND
## 1 NYUMD0327    NYUMD0334 GSA8_0_NYUMD0334-01 -3624.4315 GSA8_0_NYUMD0327-01
## 2 NYUMD0315         <NA> GSA8_0_NYUMD0315-01  -592.3739 GSA8_0_NYUMD0289-01
## 3 NYUMD0327         <NA> GSA8_0_NYUMD0327-01  -419.2268 GSA8_0_NYUMD0289-01
## 4 NYUMD0315         <NA> GSA8_0_NYUMD0315-01  -925.1157 GSA8_0_NYUMD0289-01
## 5 NYUMD0315         <NA> GSA8_0_NYUMD0315-01  -860.0781 GSA8_0_NYUMD0334-01
## 6 NYUMD0327         <NA> GSA8_0_NYUMD0327-01   -55.5857 GSA8_0_NYUMD0315-01
##     SNG.LLK2   SNG.LLK0             DBL.1ST             DBL.2ND ALPHA
## 1 -4035.0095 -3409.1657 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5
## 2 -1650.8367  -986.3261 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0315-01   0.5
## 3  -965.4518  -628.2189 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01   0.5
## 4 -2551.6585 -1551.8233 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01   0.5
## 5 -2364.5970 -1443.7875 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01   0.5
## 6  -164.0652  -104.5352 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01   0.5
##        LLK12       LLK1       LLK2      LLK10      LLK20      LLK00   PRB.DBL
## 1 -2581.7203 -4035.0095 -3624.4315 -3643.5020 -3476.9482 -3089.7874  1.00e+00
## 2  -933.2834 -1650.8367  -592.3739 -1677.4911  -933.2834 -1028.3060 2.94e-149
## 3  -564.1035  -994.3223  -419.2268  -840.8888  -565.5310  -627.3371  4.98e-64
## 4 -1447.4708  -925.1157 -2570.0140 -1455.0858 -2278.0753 -1610.0639 4.65e-228
## 5 -1359.2728  -860.0781 -2364.5970 -1390.1505 -2162.2396 -1504.2012 5.32e-218
## 6   -82.1285  -164.0652   -55.5857  -141.7246   -86.4408  -103.7608  1.00e-12
##   PRB.SNG1
## 1      NaN
## 2        1
## 3        1
## 4        1
## 5        1
## 6        1
table(m_demuxlet_edit$DMX_classification.global) #num of singlets and doublets
## 
##   AMB   DBL   SNG 
##    10  2399 16725
table(m_demuxlet_edit$DMX_maxID) #number of cells identified as each donor
## 
## NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334 
##      4972      4523      6209      3430
table(m_demuxlet_edit[,c("DMX_classification.global","DMX_maxID")]) #number of singlets or doublets identified as each donor
##                          DMX_maxID
## DMX_classification.global NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
##                       AMB         2         2         3         3
##                       DBL      1134       787       458        20
##                       SNG      3836      3734      5748      3407

Next, I will add this edited demuxlet to the Seurat object.

m_demuxlet_edit.subset <- m_demuxlet_edit[m_demuxlet_edit$BARCODE %in% colnames(pbmc.m),]

pbmc.m@meta.data <- cbind(pbmc.m@meta.data,m_demuxlet_edit.subset$DMX_maxID,m_demuxlet_edit.subset$DMX_classification.global, m_demuxlet_edit.subset$BARCODE)

head(pbmc.m@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCGATATA-1     pbmc_m      36569         7002   3.040827
## AAACCTGAGCGTAGTG-1     pbmc_m      13800         3631   2.471014
## AAACCTGAGGCAGGTT-1     pbmc_m       6063         2532   3.842982
## AAACCTGAGTCTTGCA-1     pbmc_m      16873         4662   4.065667
## AAACCTGAGTGCCATT-1     pbmc_m      15430         4615   3.000648
## AAACCTGCAACACCTA-1     pbmc_m        670          460   3.731343
##                    RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGCGATATA-1               2               2
## AAACCTGAGCGTAGTG-1               2               2
## AAACCTGAGGCAGGTT-1               1               1
## AAACCTGAGTCTTGCA-1               2               2
## AAACCTGAGTGCCATT-1               9               9
## AAACCTGCAACACCTA-1               0               0
##                    m_demuxlet_edit.subset$DMX_maxID
## AAACCTGAGCGATATA-1                        NYUMD0327
## AAACCTGAGCGTAGTG-1                        NYUMD0315
## AAACCTGAGGCAGGTT-1                        NYUMD0327
## AAACCTGAGTCTTGCA-1                        NYUMD0315
## AAACCTGAGTGCCATT-1                        NYUMD0315
## AAACCTGCAACACCTA-1                        NYUMD0327
##                    m_demuxlet_edit.subset$DMX_classification.global
## AAACCTGAGCGATATA-1                                              DBL
## AAACCTGAGCGTAGTG-1                                              SNG
## AAACCTGAGGCAGGTT-1                                              SNG
## AAACCTGAGTCTTGCA-1                                              SNG
## AAACCTGAGTGCCATT-1                                              SNG
## AAACCTGCAACACCTA-1                                              SNG
##                    m_demuxlet_edit.subset$BARCODE
## AAACCTGAGCGATATA-1             AAACCTGAGCGATATA-1
## AAACCTGAGCGTAGTG-1             AAACCTGAGCGTAGTG-1
## AAACCTGAGGCAGGTT-1             AAACCTGAGGCAGGTT-1
## AAACCTGAGTCTTGCA-1             AAACCTGAGTCTTGCA-1
## AAACCTGAGTGCCATT-1             AAACCTGAGTGCCATT-1
## AAACCTGCAACACCTA-1             AAACCTGCAACACCTA-1

Pre-Processing the new object.

The Seurat Object has already been preprocessed in my case, so this should be clean)

pbmc.m[["percent.mt"]] <- PercentageFeatureSet(pbmc.m, pattern = "^MT-")

library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
## 
##     align_plots
# look at distribution of metrics by classification
plot_grid(VlnPlot(pbmc.m, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "m_demuxlet_edit.subset$DMX_maxID"))

VlnPlot(pbmc.m, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "m_demuxlet_edit.subset$DMX_classification.global")

Now, select for MT content under 10% and for nfeatureRNA > 200 to ensure getting alive cells. I previously did this, so should see no change in cell numnbers.

pbmc.m <- subset(pbmc.m, subset = nFeature_RNA > 200 & percent.mt < 10)
length(colnames(pbmc.m))
## [1] 18706
length(rownames(pbmc.m))
## [1] 24658

Add column to meta data to identify seurat object as Basline condition. Also rename some columns for clarity purposes.

pbmc.m@meta.data$condition <- 'Monomer'
names(pbmc.m@meta.data)[names(pbmc.m@meta.data) == "m_demuxlet_edit.subset$DMX_classification.global"] <- "DMX_classification.global"
names(pbmc.m@meta.data)[names(pbmc.m@meta.data) == "m_demuxlet_edit.subset$DMX_maxID"] <- "DMX_maxID"
names(pbmc.m@meta.data)[names(pbmc.m@meta.data) == "m_demuxlet_edit.subset$BARCODE"] <- "Barcode"
head(pbmc.m@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCGATATA-1     pbmc_m      36569         7002   3.040827
## AAACCTGAGCGTAGTG-1     pbmc_m      13800         3631   2.471014
## AAACCTGAGGCAGGTT-1     pbmc_m       6063         2532   3.842982
## AAACCTGAGTCTTGCA-1     pbmc_m      16873         4662   4.065667
## AAACCTGAGTGCCATT-1     pbmc_m      15430         4615   3.000648
## AAACCTGCAACACCTA-1     pbmc_m        670          460   3.731343
##                    RNA_snn_res.0.5 seurat_clusters DMX_maxID
## AAACCTGAGCGATATA-1               2               2 NYUMD0327
## AAACCTGAGCGTAGTG-1               2               2 NYUMD0315
## AAACCTGAGGCAGGTT-1               1               1 NYUMD0327
## AAACCTGAGTCTTGCA-1               2               2 NYUMD0315
## AAACCTGAGTGCCATT-1               9               9 NYUMD0315
## AAACCTGCAACACCTA-1               0               0 NYUMD0327
##                    DMX_classification.global            Barcode condition
## AAACCTGAGCGATATA-1                       DBL AAACCTGAGCGATATA-1   Monomer
## AAACCTGAGCGTAGTG-1                       SNG AAACCTGAGCGTAGTG-1   Monomer
## AAACCTGAGGCAGGTT-1                       SNG AAACCTGAGGCAGGTT-1   Monomer
## AAACCTGAGTCTTGCA-1                       SNG AAACCTGAGTCTTGCA-1   Monomer
## AAACCTGAGTGCCATT-1                       SNG AAACCTGAGTGCCATT-1   Monomer
## AAACCTGCAACACCTA-1                       SNG AAACCTGCAACACCTA-1   Monomer
saveRDS(pbmc.m, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.monomer.final.RDS")